# Author: Ripunjoy Gohain <ripunjoygohain79@gmail.com>
# Data information:
# coding: utf-8
# !/usr/bin/env python3
import sys
import pandas as pd
# Show every column when printing frames; fixed-point floats for readability.
pd.set_option("display.max_columns", None)
pd.set_option('float_format', '{:4f}'.format)
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Silence deprecation/future warnings so notebook output stays readable.
warnings.filterwarnings('ignore')
from scipy import stats
# Record library versions so the analysis can be reproduced later.
print("Python Version: ", sys.version)
print("Pandas Version: ", pd.__version__)
print("Numpy Version: ", np.__version__)
print("Seaborn Version: ", sns.__version__)
# Dataset is a ";"-delimited text file inside a zip. Values arrive as strings
# (the file mixes "?" markers into numeric columns), so low_memory=False avoids
# mixed-dtype chunked inference. Date and Time (columns 0 and 1) are merged
# into a single DatetimeIndex named "local_time".
# NOTE(review): infer_datetime_format and dict-style parse_dates are deprecated
# in recent pandas; the raw Date column looks like d/m/Y — confirm parsing
# (dayfirst) is handled correctly by the pinned pandas version.
df = pd.read_csv("./data/household_power_consumption.zip", compression="zip", sep=";", low_memory=False, header=0,
infer_datetime_format=True, parse_dates={"local_time": [0, 1]}, index_col=["local_time"])
# Normalise the column headers to lower case for easier access.
df.columns = [x.lower() for x in df.columns]
# Missing values are represented as "?" in the raw file.
df.replace(to_replace="?", value=np.nan, inplace=True)
# All remaining values are numeric strings; float32 halves memory vs float64.
df = df.astype(np.float32)
df.head(10)
print("Shape of the data frame: ", df.shape)
# Expected number of rows if the 1-minute sampling had no gaps
# (+1 because both endpoints are included).
print("Expected number of index(data) in the time range: {:.0f}".format((df.index.max()-df.index.min()).total_seconds() / (60) + 1))
print("Data start time: ", df.index.min())
print("Data end time: ", df.index.max())
print("Total time period: ", df.index.max()-df.index.min())
df.info()
df.describe()
# Missing value statistics
print("Column wise missing values: ")
df.isna().sum()
# Shape if rows that are entirely NaN were dropped (inspection only — result
# is not assigned back to the dataframe).
df.dropna(axis=0, how="all").shape
# Check whether the drop-any row count matches the individual per-column
# missing-value counts.
df.shape[0] - df.dropna(axis=0, how="any").shape[0]
# Since the counts match, whenever one tag is missing all the other tags are
# missing too: the gaps look like communication errors rather than a single
# faulty sensor.
print("Missing values percentage: {:.2f} %".format((df.shape[0] - df.dropna(axis=0, how="any").shape[0]) * 100 / df.shape[0]))
# Check whether missing values occur at random timestamps or in long runs.
missing_stat = df.dropna().index.to_series().diff().dropna().dt.total_seconds().div(60)
# missing_stat holds the gap (in minutes) between consecutive non-missing
# timestamps: 1 means no missing value, 2 means one missing minute before that
# timestamp, 10 means nine missing minutes, and so on.
missing_stat.describe()
print("Count of missing timestamps instances: ", missing_stat[missing_stat >= 2].shape)
missing_stat[missing_stat >= 2].sort_values(ascending=False).head(10)
missing_stat[missing_stat >= 2].plot(figsize=(10,5), title="Missing values time range");
print("Consecutive missing values can go upto: {:.2f} days".format((missing_stat.max() - 1) / (60*24)))
Due to time constraints, I am not going to experiment with these methods. Ideally we would build fuzzy logic based on the length of each missing period. To keep it simple, I will fill each missing value with the previous day's power consumption at the same time of day. For longer gaps (e.g. 5 days), the last fully available day is simply repeated for all the missing days — the data is filled with the last available timestamp's values.
def fill_missing_values_simple(dataFrame, period_minutes=60 * 24):
    """Fill missing values from the same time of day on an earlier day.

    Every NaN cell is replaced by the value ``period_minutes`` rows earlier
    (one day by default, at 1-minute resolution).  NaN coordinates are
    processed in ascending row order, so gaps longer than one period are
    filled by propagating the last fully observed period forward.

    Parameters
    ----------
    dataFrame : pd.DataFrame
        All-numeric frame at a fixed 1-minute frequency; filled in place.
    period_minutes : int, optional
        Number of rows making up one "day" (default ``60 * 24``).

    Returns
    -------
    pd.DataFrame
        The same frame object, with NaNs filled wherever an earlier value
        exists.
    """
    # Work on an explicit copy of the ndarray and assign back once at the
    # end: the original wrote through repeated `dataFrame.values[...]` calls,
    # which is both very slow and only works when `.values` happens to be a
    # view of the underlying data.
    values = dataFrame.to_numpy(copy=True)
    # Visit only the NaN cells. np.where yields coordinates in row-major
    # (ascending row) order, preserving the forward-propagation behaviour
    # for multi-day gaps.
    for row, col in zip(*np.where(np.isnan(values))):
        source = row - period_minutes
        # BUG FIX: the original let a negative index wrap around to the *end*
        # of the array; there is no earlier day to borrow from, so skip.
        if source >= 0:
            values[row, col] = values[source, col]
    # Write the filled values back in place, keeping index/columns/dtypes.
    dataFrame.loc[:, :] = values
    return dataFrame
# Fill the gaps using the previous day's value at the same time.
df = fill_missing_values_simple(dataFrame=df)
df.shape
# Verify that the missing values have been filled.
df.isna().sum()
df.head()
[x for x in df.columns if "metering" in x]
# global_active_power is in kW (a one-minute average); sub meterings are in Wh.
# Derive the unmetered remainder: convert global_active_power to Wh per minute
# (kW * 1000 / 60) and subtract the energy of all sub-metered circuits.
df['sub_metering_other'] = (df.global_active_power * 1000 / 60) - (df[[x for x in df.columns if "metering" in x]].sum(axis=1))
df.sub_metering_other.describe()
# Count rows where the remainder is negative (physically unexpected).
df[df.sub_metering_other<0].shape
We expected that global_active_power (converted to energy) should be equal to or greater than the combined readings of the three sub meters.
But the negative values indicate this is not always true, which is very unlikely. Though ~1K records out of 2 million is a very small fraction, we may treat these as outliers or extend the analysis to understand them. They may be caused by faulty sensor readings.
For the time being, we will change these negative values to ZERO.
If time permits, we will try to understand this particular scenario in detail.
# Clamp the physically impossible negative remainders to zero.
df.loc[df.sub_metering_other<0, "sub_metering_other"] = 0
df.describe()
# Saving the cleaned dataframe for later use
df.to_csv("./data/cleaned_household_power_consumption.csv")
# Derive calendar features from the DatetimeIndex.
df["year"] = df.index.year
df["quarter"] = df.index.quarter
df["month"] = df.index.month
df["day"] = df.index.day
# Monday 0, Sunday 6
df["weekday"]=df.index.weekday
# Collapse to a binary flag: 1 = weekday (Mon-Fri), 0 = weekend.
df["weekday"] = (df["weekday"] < 5).astype(int)
# Numeric columns used throughout the remaining plots and statistics.
cols = ["global_active_power", "global_reactive_power", "voltage", "global_intensity", "sub_metering_1", "sub_metering_2",
"sub_metering_3", "sub_metering_other"]
# Per-column distribution plots, saved under ./plots/.
# NOTE(review): sns.distplot is deprecated/removed in modern seaborn
# (histplot/displot is the replacement) — confirm the pinned version.
for col in cols:
    sns.distplot(df[col])
    plt.tight_layout()
    plt.savefig("./plots/" + "distribution_" + col + ".png")
    plt.show()
KURTOSIS: describes the heaviness of the tails of a data distribution.
A normal distribution has a kurtosis close to 0. If the kurtosis is greater than zero, the distribution has heavier tails; if the kurtosis is less than zero, the distribution has lighter tails.
SKEWNESS: describes the symmetry of the data distribution.
If the skewness is between -0.5 and 0.5, the data are fairly symmetrical. If the skewness is between -1 and -0.5 or between 0.5 and 1, the data are moderately skewed. If the skewness is less than -1 or greater than 1, the data are highly skewed.
# D'Agostino-Pearson normality test (combines skewness and kurtosis z-scores)
# for each numeric column; also report raw kurtosis and skewness.
for col in cols:
    print("\n*** Normality Test for: {} ***".format(col))
    print("--"*30)
    stat, p = stats.normaltest(df[col])
    print("Statistics(z-score for kurtosis & skewness): {:.3f}, pvalue: {:.3f}".format(stat, p))
    # Conventional 5% significance level.
    alpha = 0.05
    if p > alpha:
        print("{} data looks Normal: (fail to reject H0)".format(col))
    else:
        print("{} data doesn't looks Normal: (reject H0)".format(col))
    print( 'Kurtosis of normal distribution: {}'.format(stats.kurtosis(df[col])))
    print( 'Skewness of normal distribution: {}'.format(stats.skew(df[col])))
In the next notebook, interactive plots will be created using Bokeh so that we can drill down. (Note: GitHub renders the source code but not the interactive plots; viewing them requires a server or running the notebook locally.)
# The five time-resolution views below differed only in resample rule, plot
# title and file name, so a single helper removes the duplicated loops.
def _save_resolution_plots(frame, columns, rule, title_label, file_prefix):
    """Plot each column of `frame` (resampled to `rule` means when `rule` is
    not None) and save each figure as ./plots/<file_prefix><column>.png.

    Parameters: frame (DataFrame with DatetimeIndex), columns (list of column
    names), rule (pandas resample rule or None for raw data), title_label
    (text appended to the plot title), file_prefix (file-name prefix).
    """
    for col in columns:
        series = frame[col] if rule is None else frame[col].resample(rule).mean()
        series.plot(title=col + " (" + title_label + ")", figsize=(15,5))
        plt.tight_layout()
        plt.savefig("./plots/" + file_prefix + col + ".png")
        plt.show()

# Raw minute-level series.
_save_resolution_plots(df, cols, None, "minute", "minutely_")
# hourly plots
_save_resolution_plots(df, cols, "H", "hour", "hourly_")
# Daily plots
_save_resolution_plots(df, cols, "D", "day", "daily_")
# Weekly plots
_save_resolution_plots(df, cols, "W", "week", "weekly_")
# For better view lets look at Monthly plots
_save_resolution_plots(df, cols, "M", "month", "monthly_")
# Map the binary weekday flag to readable labels for plot legends.
dic={0: "Weekend", 1: "Weekday"}
df["weekday_string"] = df.weekday.map(dic)
# Box plots per year, split weekday vs weekend.
# NOTE(review): positional x/y arguments to seaborn axis functions are removed
# in seaborn >= 0.13 (use x=..., y=...) — confirm the pinned version.
for col in cols:
    plt.figure(figsize=(10,5))
    sns.boxplot("year", col, hue="weekday_string", data=df, width=0.8, fliersize=3)
    plt.tight_layout()
    plt.savefig("./plots/" + "box_weekday_" + col + ".png")
    plt.show()
# Box plots per year, split by month (wider figure: up to 12 hue levels).
for col in cols:
    plt.figure(figsize=(15,5))
    sns.boxplot("year", col, hue="month", data=df, width=0.8, fliersize=3)
    plt.tight_layout()
    plt.savefig("./plots/" + "box_month_" + col + ".png")
    plt.show()
# Box plots per year, split by quarter.
for col in cols:
    plt.figure(figsize=(10,5))
    sns.boxplot("year", col, hue="quarter", data=df, width=0.8, fliersize=3)
    plt.tight_layout()
    plt.savefig("./plots/" + "box_quarter_" + col + ".png")
    plt.show()
# Factor (point) plots of yearly aggregates, split weekday vs weekend.
# NOTE(review): sns.factorplot was renamed catplot in seaborn 0.9 and later
# removed; factorplot also creates its own figure, so the preceding
# plt.figure(...) is likely ignored — verify which figure gets saved.
for col in cols:
    plt.figure(figsize=(10,5))
    sns.factorplot("year", col, hue="weekday_string", data=df, fliersize=3)
    plt.tight_layout()
    plt.savefig("./plots/" + "factor_weekday_" + col + ".png")
    plt.show()
# Factor plots of yearly aggregates, split by quarter.
for col in cols:
    plt.figure(figsize=(10,5))
    sns.factorplot("year", col, hue="quarter", data=df, fliersize=3)
    plt.tight_layout()
    plt.savefig("./plots/" + "factor_quarter_" + col + ".png")
    plt.show()
# Factor plots of monthly aggregates, one line per year.
for col in cols:
    plt.figure(figsize=(10,5))
    sns.factorplot("month", col, hue="year", data=df, fliersize=3)
    plt.tight_layout()
    plt.savefig("./plots/" + "factor_month_year_" + col + ".png")
    plt.show()
# Pearson correlation heatmap on the raw minute-level data points.
plt.figure(figsize=(10,8))
sns.heatmap(df[cols].corr(method="pearson"), vmin=-1, vmax=1, center=0, annot=True, fmt=".2f", linewidths=0.2);
plt.savefig("./plots/corr_heatmap_minute.png")
# Correlations of hourly means.
plt.figure(figsize=(10,8))
sns.heatmap(df[cols].resample("H").mean().corr(method="pearson"), vmin=-1, vmax=1, center=0, annot=True, fmt=".2f", linewidths=0.2);
plt.savefig("./plots/corr_heatmap_hour.png")
# Correlations of daily means.
plt.figure(figsize=(10,8))
sns.heatmap(df[cols].resample("D").mean().corr(method="pearson"), vmin=-1, vmax=1, center=0, annot=True, fmt=".2f", linewidths=0.2);
plt.savefig("./plots/corr_heatmap_day.png")
# Correlations of monthly means.
plt.figure(figsize=(10,8))
sns.heatmap(df[cols].resample("M").mean().corr(method="pearson"), vmin=-1, vmax=1, center=0, annot=True, fmt=".2f", linewidths=0.2);
# BUG FIX: this previously saved to corr_heatmap_day.png, silently
# overwriting the daily heatmap; save under the monthly name instead.
plt.savefig("./plots/corr_heatmap_month.png")